library(tidyverse)
library(leaflet)
library(ggmap)

Reduce number of categories

Number of categories were reduced from 39 to 15 by combining common crime types under one category. This reduction in detail is to help gain quick insights while plotting the data. It is not expected to significantly affect the analysis.

Categories <- read_csv("Categories.csv")

incidents_house_price dataset

Load the incidents_house_price dataset from the previous data cleaning exercise.

incidents_house_price <- read_csv("incidents_house_price.csv")

Reduce categories in incidents_house_price dataset

incidents_new_categories <- left_join(incidents_house_price, Categories, by = "Category") %>%
  select(-n) %>%
  rename(New_Category = `New Category`) 

Extract only features useful for plotting on a map

zipc <- incidents_new_categories %>% 
  select(zipcode, New_Category, latitude, longitude) %>%
  count(zipcode, New_Category, latitude, longitude) %>%
  arrange(desc(n))
filt_zipc <- zipc %>% filter(n > 10) %>%
  mutate(leaflet_labels = paste0(New_Category, " (", n, ")")) %>%
  mutate(leaflet_radius = findInterval(n, c(50, 100, 200, 300, 400, 500, 600, 
                                            700, 800, 900, 1000, 2000, 5000, 10000)))

filt_zipc$New_Category <- as.factor(filt_zipc$New_Category)

Number of crimes by category

The number of crimes for each category are shown on a map of San Francisco using the leaflet library. This provides a spatial visualization of areas with high concentration of crimes and crime-types. The tool tip indicates the category followed by the number of crimes in parenthesis ().

col_pal <- colorFactor(palette = "magma", levels = 
                         levels(filt_zipc$New_Category))

leaflet() %>%
  setView(lng = -122.4164, lat = 37.7766, zoom = 12) %>%
  addTiles() %>%
  addCircleMarkers(filt_zipc, lng = filt_zipc$longitude, lat = filt_zipc$latitude, 
                   weight = 5, radius = filt_zipc$leaflet_radius * 1.5, fillOpacity = 0.8,
                   color = col_pal(filt_zipc$New_Category),
                   label = filt_zipc$leaflet_labels) %>%
  addLegend("topright", col_pal, values =  filt_zipc$New_Category)

In addition to the plot above, crimes can also be visualized by category. As an illustration, the plots below show the spatial distribution of crimes in the top 2 and bottom 2 categories.

sf_map <- get_map(location = "San Francisco", maptype = "roadmap", zoom = 12)
sf <- ggmap(sf_map)

plot1 <- incidents_new_categories %>%
  filter(New_Category %in% c("THEFT", "ARSON", "SEXUAL OFFENSES", "WEAPON")) %>%
  select(zipcode, New_Category, latitude, longitude) %>%
  count(zipcode, New_Category, latitude, longitude) %>%
  arrange(desc(n))

sf +
  geom_point(data = plot1 %>% filter (n > 20), 
             aes(x = longitude, y = latitude), color = "red",
             alpha = 0.9, size = 0.1) +
  facet_grid(~ New_Category)

Number of crimes for each category

The bar plot below provides a relative comparison of the number of crimes for each category.

incidents_new_categories  %>%
  ggplot(aes(x = New_Category)) +
  geom_bar() + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous(labels=function(n){format(n, scientific = FALSE)},
                     breaks = seq(0,500000,by = 25000)) + 
  labs(x = "Categories of crime", y = "Number of crimes") + 
  coord_flip()

Incidents by day

The pie chart below is to assess the variation in the number of crimes by day of the week.

incidents_by_day <- incidents_new_categories %>%
                    select(DayOfWeek, New_Category) %>%
                    count(DayOfWeek) 
pie <- incidents_by_day %>% 
       ggplot(aes(x = factor(1), y = n, fill = DayOfWeek)) + 
       geom_bar(width = 1,stat="identity", color = "black") + 
       guides(fill=guide_legend(override.aes=list(colour=NA))) +
       coord_polar(theta = "y") + 
       theme(axis.ticks=element_blank(),  
             axis.title=element_blank(),  
             axis.text.y=element_blank()) 

y.breaks <- cumsum(incidents_by_day$n) - incidents_by_day$n/2

pie + 
  theme(axis.text.x=element_text(color='black')) +
  scale_y_continuous(breaks=y.breaks, labels=incidents_by_day$DayOfWeek) + 
  geom_text(aes(y=y.breaks, label = n), size=3)

Incidents by month and year

The heatmap below shows the number of crimes by month and year. It allows for easy visualization of hot-spots, i.e., month-year combinations with high crime-rates.

incidents_by_month <- incidents_new_categories %>%
                     select(year, month) %>%
                     count(year, month)
incidents_by_month %>%
  mutate_at(c("month"), as.factor) %>%
  mutate_at(c("year"), as.factor) %>%
  filter(n > 7000) %>%
  ggplot(aes(x = month, y = year)) +
    geom_tile(aes(fill = n)) + 
    scale_fill_gradient(low = "white", high = "darkred") +
    labs(x = "Months", y = "Number of crimes") + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0)) + 
    scale_x_discrete(labels=month.name) 

Number of incidents resolved by each PD district

The bar plot below shows the number of incidents resolved for each PD district. The visualization allows for easy comparison of the relative resolution of the number of cases for each district.

resolution <- incidents_new_categories %>%
  select(New_Category, PdDistrict,Resolution) %>%
  mutate(Resolved = ifelse(Resolution == "NONE" , 0, 1))
resolution %>%
  mutate(Resolved = ifelse(Resolved == 0, "N", "Y")) %>%
  mutate_at(c("Resolved"), .funs = as.factor) %>%
  ggplot(aes(x = Resolved)) + 
  geom_bar(aes(color = Resolved, fill = Resolved)) + 
  facet_grid(~PdDistrict) +
  scale_fill_manual(values = c("DarkRed", "DarkGreen")) +
  scale_color_manual(values = c("DarkRed", "DarkGreen"))  +
  theme_bw() +
  theme(legend.position="none")

Trend of incidents vs time

The number of incidents for the top six most frequent categories occuring hourly, during a 24 hour period, are shown in the plot below. The Time axis corresponds to the 24-hour clock time. Each datapoint corresponds to the sum total of all the crimes between the years 2003-2018, for the corresponding category and time.

incidents_new_categories$Time <- incidents_new_categories$Time %>%
  substr(1,2) %>% as.numeric()

incidents_new_categories %>%
  filter(New_Category %in% c("THEFT", "ASSAULT", "ARSON", "BURGLARY", "DRUG/ALCOHOL", "VEHICLE THEFT")) %>%
  group_by(New_Category, Time) %>%
  summarise(n = n()) %>%
  ggplot(aes(x = Time, y = n)) + 
  geom_point(aes(color = New_Category)) + 
  geom_line(aes(group = New_Category, color = New_Category)) 

For each datapoint in the plot above, the plot below provides the breakdown by year.

incidents_new_categories %>%
  filter(New_Category %in% c("THEFT", "ASSAULT", "ARSON", "BURGLARY", "DRUG/ALCOHOL", "VEHICLE THEFT"),
         year %in% 2003:2017) %>%
  group_by(year, New_Category, Time) %>%
  summarise(n = n()) %>%
  group_by(New_Category, Time) %>%
  ggplot(aes(x = Time, y = n, fill = New_Category)) +
  geom_bar(stat = "identity", width=1.0, color = "black") + 
  facet_wrap(~year)

Crimes vs House Prices

The plot below shows the variation of median house prices and the number of crimes for each PdDistrict. It is to visualize the impact of the crime rate on real estate prices.

incidents_by_price <- incidents_new_categories %>%
  group_by(PdDistrict) %>%
  mutate(median_prices = median(pricepersqft, na.rm = TRUE),
         n_crimes = n()) %>%
  select(PdDistrict, median_prices, n_crimes)%>%
  distinct()
  
incidents_by_price %>% 
  ggplot(aes(x = PdDistrict)) + 
  geom_point(aes(y = n_crimes, color = "Crimes"), size = 2) +
  geom_line(aes(y = n_crimes, group = 1, color = "Crimes"), size = 1) +
  geom_point(aes(y = median_prices * 500, color = "Property prices"), size = 2) +
  geom_line(aes(y = median_prices * 500, group = 1, color = "Property prices"), size = 1) +
  scale_y_continuous(sec.axis = sec_axis(~. *0.002, name = "Property Prices [per sqft]"),
                     labels=function(n){format(n, scientific = FALSE)},
                     breaks = seq(0,500000,by = 25000)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        plot.title = element_text(hjust = 0.5)) + 
  scale_colour_manual(values = c("red", "blue")) + 
  labs(title = "Crimes Vs Property prices", y = "Number of Crimes", 
       x = "San Francisco Areas", colour = "Parameter")